In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import multiprocessing
from multiprocessing import Pool

# One worker per logical CPU reported on the current machine, rather than a
# count hard-coded for a single machine.
num_cores = multiprocessing.cpu_count()
# sns.load_dataset already returns a pandas DataFrame, so no extra wrapping is needed.
iris = sns.load_dataset('iris')

In [7]:
# Show how many CPUs multiprocessing reports on this machine,
# for comparison with the num_cores setting used for the pool.
multiprocessing.cpu_count()


Out[7]:
8

In [8]:
def parallelize_dataframe(df, func, n_workers=None):
    """Apply ``func`` to row-wise chunks of ``df`` in separate worker processes.

    The frame is cut into ``n_workers`` consecutive row chunks whose sizes
    differ by at most one row, each chunk is handed to a
    ``multiprocessing.Pool`` worker, and the frames returned by ``func`` are
    concatenated in their original chunk order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame. It must be
        picklable so that it can be sent to the worker processes.
    n_workers : int, optional
        Number of chunks and of worker processes. Defaults to the
        notebook-level ``num_cores`` setting.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of ``func(chunk)`` over all chunks.
    """
    if n_workers is None:
        n_workers = num_cores

    # Split the row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas versions
    # deprecate. Splitting positions yields the same chunk boundaries.
    row_groups = np.array_split(np.arange(len(df)), n_workers)
    df_split = [df.iloc[rows] for rows in row_groups]

    pool = Pool(n_workers)  # create the worker pool
    try:
        results = pool.map(func, df_split)  # run func on every chunk
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [9]:
def multiply_columns(data):
    """Return a copy of ``data`` with a ``length_of_word`` column added.

    ``length_of_word`` holds ``len()`` of each value in the ``species``
    column. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``species`` column of sized values (e.g. strings).

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``length_of_word`` column.
    """
    # assign() builds a new frame instead of writing into the caller's object,
    # which may be a slice of a larger frame.
    return data.assign(length_of_word=data['species'].apply(len))

In [10]:
# Baseline: time the transformation applied to the whole frame in a single process.
%time normal_iris = multiply_columns(iris)


CPU times: user 1.41 ms, sys: 940 µs, total: 2.35 ms
Wall time: 8.52 ms

In [11]:
# Time the same transformation run through the process pool.
# In the recorded run this took longer than the single-process call
# (156 ms vs 8.52 ms wall time); presumably worker start-up and data transfer
# outweigh the work on this dataset -- TODO confirm with a larger frame.
%time parallelize_iris = parallelize_dataframe(iris, multiply_columns)


CPU times: user 29 ms, sys: 32.8 ms, total: 61.8 ms
Wall time: 156 ms

In [ ]: